R Package: [tidymodels] random forest model

A collection of packages for modeling

model
Author

Tony Duan

Published

July 12, 2023

The tidymodels framework is a collection of packages for modeling and machine learning using tidyverse principles.

Code
library(tidymodels)  
library(tidyverse)  
# Helper packages
library(readr)       # for importing data
library(vip)         # for variable importance plots

1 input data

Code
# Import the hotel-stays data from the tidymodels case study and convert
# every character column to a factor so recipes/ranger treat them as nominal.
hotels <- 
  read_csv("https://tidymodels.org/start/case-study/hotels.csv") %>%
  mutate(across(where(is.character), as.factor))
Code
dim(hotels)
[1] 50000    23
Code
glimpse(hotels)
Rows: 50,000
Columns: 23
$ hotel                          <fct> City_Hotel, City_Hotel, Resort_Hotel, R…
$ lead_time                      <dbl> 217, 2, 95, 143, 136, 67, 47, 56, 80, 6…
$ stays_in_weekend_nights        <dbl> 1, 0, 2, 2, 1, 2, 0, 0, 0, 2, 1, 0, 1, …
$ stays_in_week_nights           <dbl> 3, 1, 5, 6, 4, 2, 2, 3, 4, 2, 2, 1, 2, …
$ adults                         <dbl> 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, …
$ children                       <fct> none, none, none, none, none, none, chi…
$ meal                           <fct> BB, BB, BB, HB, HB, SC, BB, BB, BB, BB,…
$ country                        <fct> DEU, PRT, GBR, ROU, PRT, GBR, ESP, ESP,…
$ market_segment                 <fct> Offline_TA/TO, Direct, Online_TA, Onlin…
$ distribution_channel           <fct> TA/TO, Direct, TA/TO, TA/TO, Direct, TA…
$ is_repeated_guest              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type             <fct> A, D, A, A, F, A, C, B, D, A, A, D, A, …
$ assigned_room_type             <fct> A, K, A, A, F, A, C, A, D, A, D, D, A, …
$ booking_changes                <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ deposit_type                   <fct> No_Deposit, No_Deposit, No_Deposit, No_…
$ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ customer_type                  <fct> Transient-Party, Transient, Transient, …
$ average_daily_rate             <dbl> 80.75, 170.00, 8.00, 81.00, 157.60, 49.…
$ required_car_parking_spaces    <fct> none, none, none, none, none, none, non…
$ total_of_special_requests      <dbl> 1, 3, 2, 1, 4, 1, 1, 1, 1, 1, 0, 1, 0, …
$ arrival_date                   <date> 2016-09-01, 2017-08-25, 2016-11-19, 20…

target: have children / no children

Code
# Check class balance of the outcome: counts and proportions of each
# `children` level (the data set is imbalanced, ~8% "children").
hotels %>% 
  count(children) %>% 
  mutate(prop = n/sum(n))
# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  4038 0.0808
2 none     45962 0.919 

2 DATA SPLITTING & RESAMPLING

Reserve 25% of the stays for the test set.

Code
# Fix the RNG so the split is reproducible.
set.seed(123)
# initial_split() defaults to prop = 0.75, so 25% of rows go to the test set;
# strata = children keeps the class proportions equal across the two sets.
splits      <- initial_split(hotels, strata = children)

hotel_other <- training(splits)
hotel_test  <- testing(splits)
Code
# training set proportions by children
hotel_other %>% 
  count(children) %>% 
  mutate(prop = n/sum(n))
# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  3027 0.0807
2 none     34473 0.919 
Code
# test set proportions by children
hotel_test  %>% 
  count(children) %>% 
  mutate(prop = n/sum(n))
# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  1011 0.0809
2 none     11489 0.919 
Code
# Carve a single stratified validation set (20%) out of the training data;
# tuning below evaluates on this one resample instead of cross-validation.
# NOTE(review): validation_split() is deprecated in newer rsample releases in
# favor of validation_set() — confirm the installed package version.
set.seed(234)
val_set <- validation_split(hotel_other, 
                            strata = children, 
                            prop = 0.80)
val_set
# Validation Set Split (0.8/0.2)  using stratification 
# A tibble: 1 × 2
  splits               id        
  <list>               <chr>     
1 <split [30000/7500]> validation

3 recipe

Code
# Preprocessing for the random forest: derive date features (year, month,
# day of week) and holiday indicators from arrival_date, then drop the raw
# date column. No dummy coding or normalization — ranger handles factors
# and is scale-invariant.
rf_recipe <- 
  recipe(children ~ ., data = hotel_other) %>% 
  step_date(arrival_date) %>% 
  step_holiday(arrival_date) %>% 
  step_rm(arrival_date) 

4 model

Code
# Number of cores available for ranger's internal threading.
# NOTE(review): detectCores() reports logical cores and can over-count on
# shared machines; parallelly::availableCores() is the safer choice — confirm.
cores <- parallel::detectCores()
cores
[1] 8

random forest model with 8 cores

Code
# Random forest specification: mtry (predictors sampled per split) and min_n
# (minimum node size) are marked for tuning; 1000 trees is fixed. The ranger
# engine parallelizes tree growing across `cores` threads.
rf_mod <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_engine("ranger", num.threads = cores) %>% 
  set_mode("classification")

5 workflow

Code
# Bundle the model spec and the recipe so tuning and fitting run both
# preprocessing and the model as a single unit.
rf_workflow <- 
  workflow() %>% 
  add_model(rf_mod) %>% 
  add_recipe(rf_recipe)

6 tuning

Code
rf_mod
Random Forest Model Specification (classification)

Main Arguments:
  mtry = tune()
  trees = 1000
  min_n = tune()

Engine-Specific Arguments:
  num.threads = cores

Computational engine: ranger 
Code
extract_parameter_set_dials(rf_mod)
Collection of 2 parameters for tuning

 identifier  type    object
       mtry  mtry nparam[?]
      min_n min_n nparam[+]

Model parameters needing finalization:
   # Randomly Selected Predictors ('mtry')

See `?dials::finalize` or `?dials::update.parameters` for more information.

7 Train and tune the model

Code
print('start run')
[1] "start run"
Code
# Record the wall-clock start time of the tuning run (printed so the
# rendered document shows when the run began). `<-` is the idiomatic
# R assignment operator; `=` is reserved for function arguments.
a <- Sys.time()
print(a)
[1] "2023-07-14 02:59:44 CST"
Code
# Tune mtry and min_n over a space-filling grid of 25 candidate pairs,
# scored by ROC AUC on the single validation resample. save_pred = TRUE
# keeps the holdout predictions so ROC curves can be drawn later.
set.seed(345)
rf_res <- 
  rf_workflow %>% 
  tune_grid(val_set,
            grid = 25,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(roc_auc))

print('finish run')
[1] "finish run"
Code
# Record the wall-clock finish time; `b - a` below is the total tuning
# duration. Use `<-` for assignment per R convention, not `=`.
b <- Sys.time()
print(b)
[1] "2023-07-14 03:02:07 CST"
Code
print('total time')
[1] "total time"
Code
print(b-a)
Time difference of 2.378564 mins

8 result

Code
# Top 5 hyperparameter combinations ranked by validation ROC AUC.
# std_err is NA because there is only one resample (n = 1).
rf_res %>% 
  show_best(metric = "roc_auc")
# A tibble: 5 × 8
   mtry min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     8     7 roc_auc binary     0.926     1      NA Preprocessor1_Model13
2    12     7 roc_auc binary     0.926     1      NA Preprocessor1_Model01
3    13     4 roc_auc binary     0.925     1      NA Preprocessor1_Model05
4     9    12 roc_auc binary     0.924     1      NA Preprocessor1_Model19
5     6    18 roc_auc binary     0.924     1      NA Preprocessor1_Model24
Code
autoplot(rf_res)

Code
# Keep the single best combination (highest ROC AUC) for the final fit.
rf_best <- 
  rf_res %>% 
  select_best(metric = "roc_auc")
rf_best
# A tibble: 1 × 3
   mtry min_n .config              
  <int> <int> <chr>                
1     8     7 Preprocessor1_Model13
Code
# All saved validation-set predictions: 7,500 holdout rows x 25 candidate
# models = 187,500 rows, with class probabilities per observation.
rf_res %>% 
  collect_predictions()
# A tibble: 187,500 × 8
   id         .pred_children .pred_none  .row  mtry min_n children .config      
   <chr>               <dbl>      <dbl> <int> <int> <int> <fct>    <chr>        
 1 validation        0.152        0.848    13    12     7 none     Preprocessor…
 2 validation        0.0302       0.970    20    12     7 none     Preprocessor…
 3 validation        0.513        0.487    22    12     7 children Preprocessor…
 4 validation        0.0103       0.990    23    12     7 none     Preprocessor…
 5 validation        0.0111       0.989    31    12     7 none     Preprocessor…
 6 validation        0            1        38    12     7 none     Preprocessor…
 7 validation        0            1        39    12     7 none     Preprocessor…
 8 validation        0.00325      0.997    50    12     7 none     Preprocessor…
 9 validation        0.0241       0.976    54    12     7 none     Preprocessor…
10 validation        0.0441       0.956    57    12     7 children Preprocessor…
# ℹ 187,490 more rows
Code
# ROC curve data for the best random forest only: filter the saved
# predictions to rf_best's parameters, compute sensitivity/specificity
# over thresholds, and label the curve for the comparison plot below.
rf_auc <- 
  rf_res %>% 
  collect_predictions(parameters = rf_best) %>% 
  roc_curve(children, .pred_children) %>% 
  mutate(model = "Random Forest")

Compare against the top penalized logistic regression model.

Code
# Lasso logistic regression baseline (mixture = 1 is a pure L1 penalty);
# the penalty amount is tuned over a grid below.
lr_mod <- 
  logistic_reg(penalty = tune(), mixture = 1) %>% 
  set_engine("glmnet")

# Restrict holiday features to these named holidays.
holidays <- c("AllSouls", "AshWednesday", "ChristmasEve", "Easter", 
              "ChristmasDay", "GoodFriday", "NewYearsDay", "PalmSunday")

# Unlike the forest recipe, glmnet needs dummy-coded, zero-variance-free,
# normalized numeric predictors.
lr_recipe <- 
  recipe(children ~ ., data = hotel_other) %>% 
  step_date(arrival_date) %>% 
  step_holiday(arrival_date, holidays = holidays) %>% 
  step_rm(arrival_date) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_predictors())


lr_workflow <- 
  workflow() %>% 
  add_model(lr_mod) %>% 
  add_recipe(lr_recipe)


# 30 penalty candidates, log-spaced from 1e-4 to 1e-1.
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))


# Tune on the same validation split used for the random forest so the
# two models are compared on identical holdout data.
lr_res <- 
  lr_workflow %>% 
  tune_grid(val_set,
            grid = lr_reg_grid,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(roc_auc))
# Pick the 12th-smallest penalty rather than the numerical best — a slightly
# larger penalty with near-identical AUC but fewer predictors (this mirrors
# the choice made in the tidymodels case study).
lr_best <- 
  lr_res %>% 
  collect_metrics() %>% 
  arrange(penalty) %>% 
  slice(12)

# ROC curve data for the chosen logistic regression, labeled for plotting.
lr_auc <- 
  lr_res %>% 
  collect_predictions(parameters = lr_best) %>% 
  roc_curve(children, .pred_children) %>% 
  mutate(model = "Logistic Regression")
Code
# Overlay the two ROC curves; the dotted diagonal is the no-skill baseline.
bind_rows(rf_auc, lr_auc) %>% 
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) + 
  geom_path(lwd = 1.5, alpha = 0.8) +
  geom_abline(lty = 3) + 
  coord_equal() + 
  scale_color_viridis_d(option = "plasma", end = .6)

9 last fit

Code
# Final model: hard-code the tuned values (mtry = 8, min_n = 7 from rf_best);
# importance = "impurity" makes ranger record variable importance for vip().
last_rf_mod <- 
  rand_forest(mtry = 8, min_n = 7, trees = 1000) %>% 
  set_engine("ranger", num.threads = cores, importance = "impurity") %>% 
  set_mode("classification")

# Reuse the tuning workflow, swapping in the finalized model spec.
last_rf_workflow <- 
  rf_workflow %>% 
  update_model(last_rf_mod)

# last_fit() trains on the full training portion of `splits` and evaluates
# exactly once on the held-out test set.
set.seed(345)
last_rf_fit <- 
  last_rf_workflow %>% 
  last_fit(splits)

last_rf_fit
# Resampling results
# Manual resampling 
# A tibble: 1 × 6
  splits                id             .metrics .notes   .predictions .workflow 
  <list>                <chr>          <list>   <list>   <list>       <list>    
1 <split [37500/12500]> train/test sp… <tibble> <tibble> <tibble>     <workflow>
Code
# Test-set performance of the finalized model (accuracy and ROC AUC).
last_rf_fit %>% 
  collect_metrics()
# A tibble: 2 × 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.946 Preprocessor1_Model1
2 roc_auc  binary         0.923 Preprocessor1_Model1
Code
# Pull the fitted ranger model out of the workflow and plot the 20 most
# important predictors (impurity-based importance).
last_rf_fit %>% 
  extract_fit_parsnip() %>% 
  vip(num_features = 20)

Code
# ROC curve of the finalized model on the test set.
last_rf_fit %>% 
  collect_predictions() %>% 
  roc_curve(children, .pred_children) %>% 
  autoplot()

10 output model

11 Reference

https://www.tidymodels.org/start/case-study/